1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material

Data source:

https://www.kaggle.com/datasets/cdc/national-health-and-nutrition-examination-survey?select=labs.csv

Details:

https://www.cdc.gov/Nchs/Nhanes/about_nhanes.htm

demographic <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/demographic.xlsx"))
demographiDesc <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/demographic.xlsx",sheet = "Sheet1"))

rownames(demographic) <- demographic$SEQN
demographic$SEQN <- NULL

vartokeep <- demographiDesc[demographiDesc$Keep=="Yes",1]
demographic <- demographic[,vartokeep]

keepColumn <- apply(!is.na(demographic),2,sum)/nrow(demographic) > 0.75
demographic <- demographic[,keepColumn]
demographic <- demographic[complete.cases(demographic),]

## Examination data
examination <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/examination.xlsx"))
examinationDesc <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/examination.xlsx",sheet = "Sheet1"))
rownames(examination) <- examination$SEQN
examination$SEQN <- NULL

bpresDI <- examination[,c("BPXDI1","BPXDI2","BPXDI3","BPXDI4")]
examination[,c("BPXDI1","BPXDI2","BPXDI3","BPXDI4")] <- NULL
bpresDIM <- apply(bpresDI,1,mean,na.rm=TRUE)
summary(bpresDIM)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
#>    0.00   58.00   66.67   65.44   74.67  128.00    2282

bpresSY <- examination[,c("BPXSY1","BPXSY2","BPXSY3","BPXSY4")]
examination[,c("BPXSY1","BPXSY2","BPXSY3","BPXSY4")] <- NULL
bpresSYM <- apply(bpresSY,1,mean,na.rm=TRUE)
summary(bpresSYM)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
#>   64.67  106.00  115.33  118.31  128.00  228.67    2282

examination$PEASCST1 <- NULL
examination$bpresDIM <- bpresDIM
examination$bpresSYM <- bpresSYM
  
keepColumn <- apply(!is.na(examination),2,sum)/nrow(examination) > 0.75
examination <- examination[,keepColumn]
keepColumnVar <- !is.na(apply(examination,2,var,na.rm = TRUE))
examination <- examination[,keepColumnVar]
examination <- examination[complete.cases(examination),]
examination <- examination[examination$bpresDIM>0,]



## questionnaire data
questionnaire <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/questionnaire.xlsx"))
questionnaireDesc <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/questionnaire.xlsx",sheet = "Sheet1"))
rownames(questionnaire) <- questionnaire$SEQN
questionnaire$SEQN <- NULL
table(questionnaire$MCQ010)
#> 
#>    1    2    7    9 
#> 1538 8222    1    8

#keepColumn <- apply(!is.na(questionnaire),2,sum)/nrow(questionnaire) > 0.85
#questionnaire <- questionnaire[,keepColumn]
#keepColumnVar <- !is.na(apply(questionnaire,2,var,na.rm = TRUE))
#questionnaire <- questionnaire[,keepColumnVar]
#questionnaire <- questionnaire[complete.cases(questionnaire),]
#keeprow <- apply(questionnaire,2,max) < 9000
#questionnaire <- questionnaire[keeprow,]
#table(questionnaire$MCQ010)

## Diabetes
Diabetes <- questionnaire$MCQ010
names(Diabetes) <- rownames(questionnaire) 
Diabetes <- Diabetes[!is.na(Diabetes)]
Diabetes <- Diabetes[Diabetes<3]
table(Diabetes)
#> Diabetes
#>    1    2 
#> 1538 8222

## Depression 
depression <- questionnaire[,c("DPQ010","DPQ020","DPQ030","DPQ040","DPQ050","DPQ060","DPQ070","DPQ080","DPQ090")]
depression[depression>5] <- NA
depressionTot <- apply(depression,1,sum,na.rm=TRUE)
donnow <- apply(is.na(depression),1,sum) == 9
depressionTot <- depressionTot[!donnow]
table(depressionTot)
#> depressionTot
#>    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
#> 1718  852  631  470  360  233  214  162  137  104   90   66   53   51   57   31 
#>   16   17   18   19   20   21   22   23   24   25   27 
#>   39   26   27   19   16   12   11    4    7    4    3

## The labs data
labs <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/labs.xlsx"))
labsDesc <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/labs.xlsx",sheet = "Sheet1"))

rownames(labs) <- labs$SEQN
labs$SEQN <- NULL

keepColumn <- apply(!is.na(labs),2,sum)/nrow(labs) > 0.75
labs <- labs[,keepColumn]
keepColumnVar <- !is.na(apply(labs,2,var,na.rm = TRUE))
labs <- labs[,keepColumnVar]
labs <- labs[complete.cases(labs),]

includeIDS <- rownames(labs)[rownames(labs) %in% rownames(examination)]
includeIDS <- includeIDS[includeIDS %in% rownames(demographic)]

includeIDS <- includeIDS[includeIDS %in% names(Diabetes)]

Diab <- Diabetes[includeIDS]
table(Diab)
#> Diab
#>    1    2 
#>  992 5006

NHNES <- cbind(demographic[includeIDS,],examination[includeIDS,],labs[includeIDS,]) 
NHNES$Diab <- Diab

table(NHNES$Diab)
#> 
#>    1    2 
#>  992 5006

## Only Adults in this experiment
NHNES <- NHNES[NHNES$RIDAGEYR >= 20,]

NHNES$Diab <- 1*(NHNES$Diab == 1)

table(NHNES$Diab)
#> 
#>    0    1 
#> 3664  650
includeIDS <- rownames(NHNES)
includeIDS <- includeIDS[includeIDS %in% names(depressionTot)]
depressionTot <- depressionTot[includeIDS]
NHNESDep <- cbind(NHNES[includeIDS,],depressionTot)

NHNESDep$depressionTot <- log(NHNESDep$depressionTot+1)

tbdep <- table(NHNESDep$depressionTot)
genderDiab <- NHNESDep[,c("RIAGENDR","Diab")]

isContinuous <- sapply(apply(NHNESDep,2,unique),length) > 4
sum(isContinuous)
#> [1] 63

NHNESDep <- cbind(NHNESDep[,isContinuous],genderDiab)

table(NHNESDep$RIAGENDR)
#> 
#>    1    2 
#> 2059 2094

colnames(NHNESDep)[sapply(apply(NHNESDep,2,unique),length) < 10]
#>  [1] "OHX04TC"  "OHX06TC"  "OHX10TC"  "OHX11TC"  "OHX13TC"  "OHX20TC" 
#>  [7] "OHX25TC"  "OHX29TC"  "LBDBANO"  "RIAGENDR" "Diab"

1.1.0.1 Standarize the names for the reporting

studyName <- "NHNES"
dataframe <- NHNESDep
outcome <- "RIDAGEYR"

thro <- 0.40
cexheat <- 0.35
loops <- 30

1.2 Generaring the report

1.2.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")
library("rpart")
library(multiColl)
library(car)
library("pls")

source("C:/Users/jtame/Documents/GitHub/LatentBiomarkers/RMD/RepeatedLinearCV.R")

1.2.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
4153 64

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1500 

1.2.3 Training and testing sets

set.seed(1)
trainsamples <- sample(nrow(dataframe),3*nrow(dataframe)/4)

trainingset <- dataframe[trainsamples,]
testingset <- dataframe[-trainsamples,]

pander::pander(t(summary(trainingset)))
             
RIDAGEYR Min. :20.00 1st Qu.:33.00 Median :47.00 Mean :48.03 3rd Qu.:62.00 Max. :80.00
WTINT2YR Min. : 5735 1st Qu.: 19354 Median : 28735 Mean : 41781 3rd Qu.: 59117 Max. :167885
WTMEC2YR Min. : 6107 1st Qu.: 19999 Median : 29753 Mean : 43054 3rd Qu.: 60384 Max. :171075
PEASCTM1 Min. : 13.0 1st Qu.: 664.0 Median : 755.0 Mean : 794.6 3rd Qu.: 899.0 Max. :2056.0
BPXPLS Min. : 42.00 1st Qu.: 64.00 Median : 72.00 Mean : 72.51 3rd Qu.: 80.00 Max. :140.00
BPXML1 Min. :110.0 1st Qu.:140.0 Median :140.0 Mean :147.8 3rd Qu.:160.0 Max. :220.0
BMXWT Min. : 32.80 1st Qu.: 66.80 Median : 78.80 Mean : 81.67 3rd Qu.: 92.90 Max. :195.40
BMXHT Min. :140.9 1st Qu.:160.3 Median :167.5 Mean :167.8 3rd Qu.:175.1 Max. :202.6
BMXBMI Min. :14.10 1st Qu.:24.20 Median :27.90 Mean :28.94 3rd Qu.:32.30 Max. :64.70
BMXLEG Min. :26.00 1st Qu.:36.60 Median :39.00 Mean :39.06 3rd Qu.:41.60 Max. :51.90
BMXARML Min. :28.70 1st Qu.:35.30 Median :37.30 Mean :37.36 3rd Qu.:39.30 Max. :46.50
BMXARMC Min. :19.00 1st Qu.:29.60 Median :32.80 Mean :33.13 3rd Qu.:36.10 Max. :57.00
BMXWAIST Min. : 55.50 1st Qu.: 87.53 Median : 97.80 Mean : 99.12 3rd Qu.:108.47 Max. :164.20
MGXH1T1 Min. : 6.50 1st Qu.:25.00 Median :31.00 Mean :32.68 3rd Qu.:40.00 Max. :81.50
MGXH2T1 Min. : 5.80 1st Qu.:25.40 Median :31.80 Mean :33.41 3rd Qu.:40.90 Max. :79.50
MGXH1T2 Min. : 5.2 1st Qu.:26.1 Median :32.4 Mean :34.3 3rd Qu.:41.8 Max. :75.9
MGXH2T2 Min. : 6.00 1st Qu.:26.40 Median :33.00 Mean :34.79 3rd Qu.:42.60 Max. :81.40
MGXH1T3 Min. : 9.20 1st Qu.:26.60 Median :33.10 Mean :34.96 3rd Qu.:42.50 Max. :81.20
MGXH2T3 Min. : 6.60 1st Qu.:26.43 Median :33.30 Mean :35.17 3rd Qu.:43.10 Max. :82.80
MGDCGSZ Min. : 17.90 1st Qu.: 55.30 Median : 68.25 Mean : 72.10 3rd Qu.: 88.25 Max. :162.80
OHX04TC Min. :1.000 1st Qu.:2.000 Median :2.000 Mean :2.494 3rd Qu.:2.000 Max. :5.000
OHX06TC Min. :1.000 1st Qu.:2.000 Median :2.000 Mean :2.294 3rd Qu.:2.000 Max. :5.000
OHX10TC Min. :2.000 1st Qu.:2.000 Median :2.000 Mean :2.357 3rd Qu.:2.000 Max. :5.000
OHX11TC Min. :1.0 1st Qu.:2.0 Median :2.0 Mean :2.3 3rd Qu.:2.0 Max. :5.0
OHX13TC Min. :1.000 1st Qu.:2.000 Median :2.000 Mean :2.501 3rd Qu.:2.000 Max. :5.000
OHX20TC Min. :1.000 1st Qu.:2.000 Median :2.000 Mean :2.367 3rd Qu.:2.000 Max. :5.000
OHX25TC Min. :1.000 1st Qu.:2.000 Median :2.000 Mean :2.215 3rd Qu.:2.000 Max. :5.000
OHX29TC Min. :1.000 1st Qu.:2.000 Median :2.000 Mean :2.375 3rd Qu.:2.000 Max. :5.000
bpresDIM Min. : 11.33 1st Qu.: 63.33 Median : 70.67 Mean : 70.00 3rd Qu.: 76.67 Max. :128.00
bpresSYM Min. : 79.33 1st Qu.:110.67 Median :120.00 Mean :122.63 3rd Qu.:132.00 Max. :212.00
URXUMA Min. : 0.21 1st Qu.: 4.10 Median : 7.70 Mean : 44.17 3rd Qu.: 16.20 Max. :9600.00
URXUMS Min. : 0.21 1st Qu.: 4.10 Median : 7.70 Mean : 44.17 3rd Qu.: 16.20 Max. :9600.00
URXUCR.x Min. : 8.0 1st Qu.: 59.0 Median :106.0 Mean :120.4 3rd Qu.:162.0 Max. :659.0
URXCRS Min. : 707.2 1st Qu.: 5215.6 Median : 9370.4 Mean :10643.3 3rd Qu.:14320.8 Max. :58255.6
URDACT Min. : 0.26 1st Qu.: 4.74 Median : 7.14 Mean : 44.65 3rd Qu.: 13.20 Max. :9000.00
LBXWBCSI Min. : 2.600 1st Qu.: 5.700 Median : 7.000 Mean : 7.303 3rd Qu.: 8.400 Max. :55.700
LBXLYPCT Min. : 4.00 1st Qu.:24.90 Median :30.00 Mean :30.52 3rd Qu.:35.60 Max. :88.00
LBXMOPCT Min. : 1.300 1st Qu.: 6.700 Median : 7.900 Mean : 8.191 3rd Qu.: 9.400 Max. :38.900
LBXNEPCT Min. : 8.80 1st Qu.:51.90 Median :58.40 Mean :57.87 3rd Qu.:64.00 Max. :90.80
LBXEOPCT Min. : 0.000 1st Qu.: 1.500 Median : 2.200 Mean : 2.739 3rd Qu.: 3.400 Max. :28.300
LBXBAPCT Min. :0.0000 1st Qu.:0.5000 Median :0.7000 Mean :0.7408 3rd Qu.:0.9000 Max. :5.8000
LBDLYMNO Min. : 0.40 1st Qu.: 1.70 Median : 2.10 Mean : 2.18 3rd Qu.: 2.60 Max. :49.00
LBDMONO Min. :0.1000 1st Qu.:0.4000 Median :0.6000 Mean :0.5807 3rd Qu.:0.7000 Max. :3.4000
LBDNENO Min. : 0.400 1st Qu.: 3.100 Median : 4.000 Mean : 4.297 3rd Qu.: 5.100 Max. :15.800
LBDEONO Min. :0.0000 1st Qu.:0.1000 Median :0.2000 Mean :0.1965 3rd Qu.:0.2000 Max. :3.1000
LBDBANO Min. :0.00000 1st Qu.:0.00000 Median :0.00000 Mean :0.04672 3rd Qu.:0.10000 Max. :0.80000
LBXRBCSI Min. :2.890 1st Qu.:4.360 Median :4.650 Mean :4.672 3rd Qu.:4.970 Max. :8.300
LBXHGB Min. : 6.40 1st Qu.:13.10 Median :14.10 Mean :14.07 3rd Qu.:15.10 Max. :18.90
LBXHCT Min. :21.70 1st Qu.:38.90 Median :41.60 Mean :41.58 3rd Qu.:44.30 Max. :55.90
LBXMCVSI Min. : 55.70 1st Qu.: 86.40 Median : 89.60 Mean : 89.22 3rd Qu.: 92.80 Max. :108.70
LBXMCHSI Min. :16.80 1st Qu.:29.10 Median :30.40 Mean :30.19 3rd Qu.:31.60 Max. :46.40
LBXMC Min. :28.00 1st Qu.:33.30 Median :33.80 Mean :33.81 3rd Qu.:34.40 Max. :44.90
LBXRDW Min. :11.70 1st Qu.:13.00 Median :13.40 Mean :13.66 3rd Qu.:14.00 Max. :26.10
LBXPLTSI Min. : 18 1st Qu.:196 Median :229 Mean :236 3rd Qu.:271 Max. :633
LBXMPSI Min. : 6.200 1st Qu.: 7.800 Median : 8.400 Mean : 8.454 3rd Qu.: 9.000 Max. :13.000
PHAFSTHR.x Min. : 0 1st Qu.: 2 Median : 7 Mean : 7 3rd Qu.:11 Max. :35
PHAFSTMN.x Min. : 0.00 1st Qu.:14.00 Median :30.00 Mean :29.39 3rd Qu.:44.00 Max. :59.00
LBDHDD Min. : 10.00 1st Qu.: 42.00 Median : 50.00 Mean : 52.71 3rd Qu.: 62.00 Max. :173.00
LBDHDDSI Min. :0.260 1st Qu.:1.090 Median :1.290 Mean :1.363 3rd Qu.:1.600 Max. :4.470
LBXTC Min. : 82.0 1st Qu.:162.0 Median :187.0 Mean :190.5 3rd Qu.:214.0 Max. :813.0
LBDTCSI Min. : 2.120 1st Qu.: 4.190 Median : 4.840 Mean : 4.926 3rd Qu.: 5.530 Max. :21.020
URXVOL1 Min. : 2.0 1st Qu.: 46.0 Median : 88.0 Mean :110.8 3rd Qu.:153.8 Max. :552.0
depressionTot Min. :0.000 1st Qu.:0.000 Median :1.099 Mean :1.022 3rd Qu.:1.609 Max. :3.332
RIAGENDR Min. :1.000 1st Qu.:1.000 Median :2.000 Mean :1.509 3rd Qu.:2.000 Max. :2.000
Diab Min. :0.0000 1st Qu.:0.0000 Median :0.0000 Mean :0.1493 3rd Qu.:0.0000 Max. :1.0000

varlist <- colnames(trainingset)
varlist <- varlist[varlist != outcome]

1.2.4 Correlation Matrix of the Data

The heat map of the data

par(op)


  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  cormat <- cor(testingset[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  pander::pander(max(abs(cormat)))

1

par(op)

1.2.5 Decorrelating using ILAA

ILAA bootstrapped training and testing sets

trainage_DE <- ILAA(trainingset,thr=thro,Outcome=outcome,verbose=TRUE)

fast | LM | URXUMA URXUCR.x PHAFSTMN.x WTINT2YR WTMEC2YR PEASCTM1 BPXPLS BPXML1 BMXWT 0.86885246 0.85245902 0.08196721 0.06557377 0.52459016 0.59016393

Included: 61 , Uni p: 0.002459016 , Base Size: 3 , Rcrit: 0.05038592

1 <R=0.835,thr=0.950>, Top: 7< 1 >Fa= 7,<|><>Tot Used: 18 , Added: 11 , Zero Std: 0 , Max Cor: 1.000

2 <R=0.774,thr=0.950>, Top: 3< 1 >Fa= 8,<|><>Tot Used: 19 , Added: 3 , Zero Std: 0 , Max Cor: 0.938

3 <R=0.740,thr=0.900>, Top: 4< 1 >Fa= 11,<|><>Tot Used: 26 , Added: 4 , Zero Std: 0 , Max Cor: 0.895

4 <R=0.700,thr=0.800>, Top: 4< 1 >Fa= 14,<|><>Tot Used: 34 , Added: 5 , Zero Std: 0 , Max Cor: 0.973

5 <R=0.681,thr=0.950>, Top: 2< 1 >Fa= 16,<|><>Tot Used: 36 , Added: 2 , Zero Std: 0 , Max Cor: 0.789

6 <R=0.651,thr=0.700>, Top: 6< 2 >Fa= 18,<|><>Tot Used: 40 , Added: 7 , Zero Std: 0 , Max Cor: 0.914

7 <R=0.625,thr=0.900>, Top: 1< 1 >Fa= 19,<|><>Tot Used: 40 , Added: 1 , Zero Std: 0 , Max Cor: 0.770

8 <R=0.607,thr=0.700>, Top: 4< 1 >Fa= 19,<|><>Tot Used: 40 , Added: 3 , Zero Std: 0 , Max Cor: 0.697

9 <R=0.591,thr=0.600>, Top: 8< 2 >Fa= 24,<|><>Tot Used: 49 , Added: 8 , Zero Std: 0 , Max Cor: 0.631

10 <R=0.545,thr=0.600>, Top: 1< 1 >Fa= 24,<|><>Tot Used: 49 , Added: 1 , Zero Std: 0 , Max Cor: 0.892

11 <R=0.571,thr=0.800>, Top: 1< 1 >Fa= 24,<|><>Tot Used: 49 , Added: 1 , Zero Std: 0 , Max Cor: 0.748

12 <R=0.556,thr=0.700>, Top: 1< 1 >Fa= 24,<|><>Tot Used: 49 , Added: 1 , Zero Std: 0 , Max Cor: 0.722

13 <R=0.554,thr=0.700>, Top: 1< 1 >Fa= 24,<|><>Tot Used: 49 , Added: 1 , Zero Std: 0 , Max Cor: 0.839

14 <R=0.566,thr=0.800>, Top: 1< 1 >Fa= 24,<|><>Tot Used: 49 , Added: 1 , Zero Std: 0 , Max Cor: 0.600

15 <R=0.540,thr=0.500>, Top: 8< 3 >Fa= 25,<|><>Tot Used: 51 , Added: 16 , Zero Std: 0 , Max Cor: 0.640

16 <R=0.524,thr=0.600>, Top: 2< 1 >Fa= 25,<|><>Tot Used: 51 , Added: 2 , Zero Std: 0 , Max Cor: 0.672

17 <R=0.508,thr=0.600>, Top: 1< 1 >Fa= 25,<|><>Tot Used: 51 , Added: 1 , Zero Std: 0 , Max Cor: 0.566

18 <R=0.482,thr=0.500>, Top: 3< 1 >Fa= 25,<|><>Tot Used: 51 , Added: 3 , Zero Std: 0 , Max Cor: 0.598

19 <R=0.483,thr=0.500>, Top: 1< 1 >Fa= 25,<|><>Tot Used: 51 , Added: 1 , Zero Std: 0 , Max Cor: 0.673

20 <R=0.491,thr=0.600>, Top: 1< 1 >Fa= 25,<|><>Tot Used: 51 , Added: 1 , Zero Std: 0 , Max Cor: 0.633

21 <R=0.484,thr=0.600>, Top: 1< 1 >Fa= 25,<|><>Tot Used: 51 , Added: 1 , Zero Std: 0 , Max Cor: 0.484

22 <R=0.454,thr=0.400>, Top: 7< 1 >Fa= 26,<|><>Tot Used: 53 , Added: 6 , Zero Std: 0 , Max Cor: 0.477

23 <R=0.459,thr=0.400>, Top: 1< 1 >Fa= 27,<|><>Tot Used: 53 , Added: 1 , Zero Std: 0 , Max Cor: 0.408

24 <R=0.408,thr=0.400>, Top: 1< 1 >Fa= 27,<|><>Tot Used: 53 , Added: 1 , Zero Std: 0 , Max Cor: 0.560

25 <R=0.560,thr=0.500>, Top: 1< 1 >Fa= 28,<|><>Tot Used: 53 , Added: 1 , Zero Std: 0 , Max Cor: 0.456

26 <R=0.456,thr=0.400>, Top: 1< 1 >Fa= 28,<|><>Tot Used: 53 , Added: 1 , Zero Std: 0 , Max Cor: 0.396

27 <R=0.396,thr=0.400>

[ 27 ], 0.4042229 Decor Dimension: 53 Nused: 53 . Cor to Base: 34 , ABase: 61 , Outcome Base: 0

#trainage_DE <- ILAA(trainingset,thr=thro,Outcome=outcome,verbose=TRUE,bootstrap=30)

testage_DE <- predictDecorrelate(trainage_DE,testingset)

1.2.6 The Formulas

Generating the formulas


theLaFormulas <- getLatentCoefficients(trainage_DE)

theCharformulas <- attr(theLaFormulas,"LatentCharFormulas")
pander::pander(as.matrix(theCharformulas))
La_WTMEC2YR - (1.025)WTINT2YR + WTMEC2YR
La_BMXWT + BMXWT - (2.759)BMXBMI - (0.325)MGDCGSZ
La_BMXHT - (0.969)BMXWT + BMXHT + (2.674)BMXBMI
La_BMXLEG - (0.142)BMXWT + (0.393)BMXBMI + BMXLEG - (0.206)BMXARML - (3.54e-03)MGDCGSZ
La_BMXARML - (0.212)BMXWT + (0.584)BMXBMI + BMXARML - (5.26e-03)MGDCGSZ
La_BMXARMC - (0.666)BMXBMI + BMXARMC - (0.066)MGDCGSZ
La_BMXWAIST - (0.494)BMXWT - (0.843)BMXBMI + BMXWAIST + (0.161)MGDCGSZ
La_MGXH1T1 + MGXH1T1 - (0.439)MGDCGSZ
La_MGXH2T1 + MGXH2T1 + (0.614)MGXH1T3 - (0.764)MGDCGSZ
La_MGXH1T2 + MGXH1T2 - (0.535)MGXH1T3 - (0.211)MGDCGSZ
La_MGXH2T2 + MGXH2T2 + (0.599)MGXH1T3 - (0.780)MGDCGSZ
La_MGXH1T3 + MGXH1T3 - (0.486)MGDCGSZ
La_MGXH2T3 + (0.571)MGXH1T3 + MGXH2T3 - (0.771)MGDCGSZ
La_OHX04TC + OHX04TC - (0.620)OHX13TC
La_OHX11TC - (0.795)OHX06TC + OHX11TC
La_OHX13TC - (0.708)OHX06TC + OHX13TC
La_OHX20TC + OHX20TC - (0.605)OHX29TC
La_OHX25TC - (0.489)OHX06TC + OHX25TC
La_OHX29TC - (0.616)OHX06TC + OHX29TC
La_bpresSYM - (0.911)BPXML1 + bpresSYM
La_URXUMS - URXUMA + URXUMS
La_URXCRS - (88.400)URXUCR.x + URXCRS
La_URDACT - (0.858)URXUMA + URDACT
La_LBXWBCSI + LBXWBCSI + (0.126)LBXNEPCT - (1.627)LBDNENO
La_LBXLYPCT + LBXLYPCT + (0.849)LBXNEPCT
La_LBXMOPCT + (0.989)LBXLYPCT + LBXMOPCT + (0.987)LBXNEPCT + (0.991)LBXEOPCT + (0.962)LBXBAPCT
La_LBXEOPCT + (0.443)LBXLYPCT + (0.376)LBXNEPCT + LBXEOPCT
La_LBDLYMNO - (0.947)LBXWBCSI - (0.074)LBXLYPCT - (0.075)LBXNEPCT - (0.061)LBXEOPCT + LBDLYMNO + (1.048)LBDNENO + (0.850)LBDEONO
La_LBDMONO - (0.587)LBXWBCSI + (0.013)LBXLYPCT + (0.011)LBXNEPCT + (0.018)LBXEOPCT + (0.577)LBDLYMNO + LBDMONO + (0.577)LBDNENO + (0.490)LBDEONO
La_LBDNENO - (0.128)LBXNEPCT + LBDNENO
La_LBDEONO + (4.59e-03)LBXNEPCT - (0.072)LBXEOPCT - (0.036)LBDNENO + LBDEONO
La_LBDBANO + (2.10e-03)LBXNEPCT - (0.100)LBXBAPCT - (0.016)LBDNENO + LBDBANO
La_LBXRBCSI + LBXRBCSI + (0.333)LBXHGB - (0.222)LBXHCT + (0.103)LBXMCVSI - (0.151)LBXMCHSI
La_LBXHGB + LBXHGB - (0.354)LBXHCT
La_LBXHCT - (0.083)MGDCGSZ + LBXHCT
La_LBXMCVSI + (5.969)LBXHGB - (2.022)LBXHCT + LBXMCVSI - (2.868)LBXMCHSI
La_LBXMCHSI - (3.528)LBXHGB + (1.248)LBXHCT + LBXMCHSI
La_LBXMC + (1.438)LBXRBCSI - (0.894)LBXHGB + (0.135)LBXHCT + (0.309)LBXMCVSI - (0.709)LBXMCHSI + LBXMC
La_LBXMPSI + (6.32e-03)LBXPLTSI + LBXMPSI
La_LBDHDDSI - (0.026)LBDHDD + LBDHDDSI
La_LBDTCSI - (0.026)LBXTC + LBDTCSI

1.2.7 Formulas Network

Displaying the features associations

par(op)

  transform <- attr(trainage_DE,"UPLTM") != 0
  colnames(transform) <- str_remove_all(colnames(transform),"La_")
  transform <- abs(transform*cor(trainingset[,rownames(transform)])) # The weights are proportional to the observed correlation
  
  
  VertexSize <- attr(trainage_DE,"fscore") # The size depends on the variable independence relevance (fscore)
  names(VertexSize) <- str_remove_all(names(VertexSize),"La_")
  VertexSize <- 0.5+9.5*(VertexSize-min(VertexSize))/(max(VertexSize)-min(VertexSize)) # Normalization
  VertexSize <- VertexSize[colnames(transform)]
  gr <- graph_from_adjacency_matrix(transform,mode = "directed",diag = FALSE,weighted=TRUE)
  gr$layout <- layout_with_fr
  
  fc <- cluster_optimal(gr)
#          fc <- cluster_walktrap (gr,steps=50)

  plot(fc, gr,
       edge.width = 2*E(gr)$weight,
       vertex.size=VertexSize,
       edge.arrow.size=0.5,
       edge.arrow.width=0.75,
       vertex.label.color="purple",
#       vertex.label.cex=0.85,
#       vertex.label.dist=1.2,
       vertex.label.cex=(0.70 + 0.025*VertexSize),
       vertex.label.dist=(0.5 + 0.05*VertexSize),

       
       main="Feature Association")


par(op)

      varratios <- attr(trainage_DE,"VarRatio")
      names(varratios) <- str_remove_all(names(varratios),"La_")
      fscores <- attr(trainage_DE,"fscore") 
      names(fscores) <- str_remove_all(names(fscores),"La_")
      clustable <- as.data.frame(cbind(Variable=fc$names,
                                       Formula=as.character(theCharformulas[paste("La_",fc$names,sep="")]),
                                       Cluster=fc$membership,
                                       ResidualVariance=round(varratios[fc$names],3),
                                       Fscore=round(fscores[fc$names],3)
                                       )
                                 )
      rownames(clustable) <- str_replace_all(rownames(clustable),"__","_")
      clustable$Variable <- NULL
      clustable$Cluster <- as.integer(clustable$Cluster)
      clustable$ResidualVariance <- as.numeric(clustable$ResidualVariance)
      clustable$Fscore <- as.numeric(clustable$Fscore)
      clustable <- clustable[order(-clustable$Fscore),]
      clustable <- clustable[order(-clustable$ResidualVariance),]
      clustable <- clustable[order(clustable$Cluster),]

pander::pander(as.matrix(clustable))
  Formula Cluster ResidualVariance Fscore
WTINT2YR NA 1 1.000 1
WTMEC2YR - (1.025)WTINT2YR + WTMEC2YR 1 0.002 -1
BPXML1 NA 2 1.000 1
bpresSYM - (0.911)BPXML1 + bpresSYM 2 0.215 -1
BMXBMI NA 3 1.000 3
BMXLEG - (0.142)BMXWT + (0.393)BMXBMI + BMXLEG - (0.206)BMXARML - (3.54e-03)MGDCGSZ 3 0.449 -2
BMXARML - (0.212)BMXWT + (0.584)BMXBMI + BMXARML - (5.26e-03)MGDCGSZ 3 0.397 0
BMXARMC - (0.666)BMXBMI + BMXARMC - (0.066)MGDCGSZ 3 0.148 -2
BMXWAIST - (0.494)BMXWT - (0.843)BMXBMI + BMXWAIST + (0.161)MGDCGSZ 3 0.127 -2
BMXWT + BMXWT - (2.759)BMXBMI - (0.325)MGDCGSZ 3 0.113 1
BMXHT - (0.969)BMXWT + BMXHT + (2.674)BMXBMI 3 0.053 -1
MGDCGSZ NA 4 1.000 10
MGXH1T1 + MGXH1T1 - (0.439)MGDCGSZ 4 0.119 -1
MGXH2T1 + MGXH2T1 + (0.614)MGXH1T3 - (0.764)MGDCGSZ 4 0.065 -2
MGXH1T3 + MGXH1T3 - (0.486)MGDCGSZ 4 0.058 3
MGXH1T2 + MGXH1T2 - (0.535)MGXH1T3 - (0.211)MGDCGSZ 4 0.057 -2
MGXH2T2 + MGXH2T2 + (0.599)MGXH1T3 - (0.780)MGDCGSZ 4 0.042 -2
MGXH2T3 + (0.571)MGXH1T3 + MGXH2T3 - (0.771)MGDCGSZ 4 0.042 -2
OHX06TC NA 5 1.000 4
OHX29TC - (0.616)OHX06TC + OHX29TC 5 0.688 0
OHX25TC - (0.489)OHX06TC + OHX25TC 5 0.673 -1
OHX13TC - (0.708)OHX06TC + OHX13TC 5 0.672 0
OHX20TC + OHX20TC - (0.605)OHX29TC 5 0.624 -1
OHX04TC + OHX04TC - (0.620)OHX13TC 5 0.606 -1
OHX11TC - (0.795)OHX06TC + OHX11TC 5 0.392 -1
URXUMA NA 6 1.000 3
URDACT - (0.858)URXUMA + URDACT 6 0.379 -1
URXUMS - URXUMA + URXUMS 6 0.000 -2
URXUCR.x NA 7 1.000 2
URXCRS - (88.400)URXUCR.x + URXCRS 7 0.000 -2
LBXNEPCT NA 8 1.000 6
LBXBAPCT NA 8 1.000 3
LBXEOPCT + (0.443)LBXLYPCT + (0.376)LBXNEPCT + LBXEOPCT 8 0.565 4
LBDNENO - (0.128)LBXNEPCT + LBDNENO 8 0.530 5
LBDBANO + (2.10e-03)LBXNEPCT - (0.100)LBXBAPCT - (0.016)LBDNENO + LBDBANO 8 0.405 -2
LBDEONO + (4.59e-03)LBXNEPCT - (0.072)LBXEOPCT - (0.036)LBDNENO + LBDEONO 8 0.135 -1
LBXWBCSI + LBXWBCSI + (0.126)LBXNEPCT - (1.627)LBDNENO 8 0.131 0
LBXLYPCT + LBXLYPCT + (0.849)LBXNEPCT 8 0.129 4
LBDMONO - (0.587)LBXWBCSI + (0.013)LBXLYPCT + (0.011)LBXNEPCT + (0.018)LBXEOPCT + (0.577)LBDLYMNO + LBDMONO + (0.577)LBDNENO + (0.490)LBDEONO 8 0.113 -5
LBDLYMNO - (0.947)LBXWBCSI - (0.074)LBXLYPCT - (0.075)LBXNEPCT - (0.061)LBXEOPCT + LBDLYMNO + (1.048)LBDNENO + (0.850)LBDEONO 8 0.006 -5
LBXMOPCT + (0.989)LBXLYPCT + LBXMOPCT + (0.987)LBXNEPCT + (0.991)LBXEOPCT + (0.962)LBXBAPCT 8 0.002 -9
LBXHCT - (0.083)MGDCGSZ + LBXHCT 9 0.797 3
LBXMCHSI - (3.528)LBXHGB + (1.248)LBXHCT + LBXMCHSI 9 0.658 3
LBXHGB + LBXHGB - (0.354)LBXHCT 9 0.068 4
LBXMC + (1.438)LBXRBCSI - (0.894)LBXHGB + (0.135)LBXHCT + (0.309)LBXMCVSI - (0.709)LBXMCHSI + LBXMC 9 0.016 -3
LBXRBCSI + LBXRBCSI + (0.333)LBXHGB - (0.222)LBXHCT + (0.103)LBXMCVSI - (0.151)LBXMCHSI 9 0.009 -4
LBXMCVSI + (5.969)LBXHGB - (2.022)LBXHCT + LBXMCVSI - (2.868)LBXMCHSI 9 0.007 -4
LBXPLTSI NA 10 1.000 1
LBXMPSI + (6.32e-03)LBXPLTSI + LBXMPSI 10 0.834 -1
LBDHDD NA 11 1.000 1
LBDHDDSI - (0.026)LBDHDD + LBDHDDSI 11 0.000 -1
LBXTC NA 12 1.000 1
LBDTCSI - (0.026)LBXTC + LBDTCSI 12 0.000 -1

1.2.8 Correlation Matrix of the Data

The heat map of the ILAA transformed data

par(op)

varlistDe <- colnames(trainage_DE)
varlistDe <- varlistDe[varlistDe != outcome]

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)

# Training    
  cormat <- cor(trainage_DE[,varlistDe],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Training: After ILAA Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  pander::pander(max(abs(cormat)))

0.726


  par(op)

# Testing

  cormat <- cor(testage_DE[,varlistDe],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Testing: After ILAA Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")

  diag(cormat) <- 0
  pander::pander(max(abs(cormat)))

0.895


  par(op)

  

1.2.9 Modeling

1.2.9.1 Modeling outcome using raw set

outcomeModel <- LASSO_1SE(formula(paste(outcome,"~.")),trainingset);
predOutcome <- predict(outcomeModel,testingset)
pander::pander(as.matrix(outcomeModel$coef))
(Intercept) 6.96e+00
WTMEC2YR -2.93e-06
PEASCTM1 1.23e-03
BPXPLS -1.54e-01
BPXML1 2.26e-01
BMXWT -1.25e-01
BMXBMI -7.52e-01
BMXLEG -8.79e-01
BMXARML 7.77e-01
BMXARMC 7.49e-02
BMXWAIST 5.94e-01
MGXH1T1 -3.79e-02
MGXH1T2 1.16e-01
MGXH2T2 4.81e-02
MGXH1T3 8.50e-02
MGXH2T3 1.71e-01
MGDCGSZ -4.16e-01
OHX04TC 1.19e+00
OHX06TC 6.25e-01
OHX10TC 3.47e-01
OHX11TC -3.77e-01
OHX13TC 1.17e+00
OHX20TC 1.39e+00
OHX25TC 3.99e-01
OHX29TC 4.90e-01
bpresDIM -4.88e-02
bpresSYM 5.32e-02
URXUMA 1.17e-04
URXUMS 2.76e-19
URXUCR.x -1.63e-02
URDACT -2.50e-03
LBXLYPCT -2.23e-01
LBXMOPCT 1.36e-01
LBXEOPCT -1.54e-01
LBXBAPCT 2.98e+00
LBDLYMNO 4.23e-01
LBDMONO 1.74e+00
LBDNENO -1.16e+00
LBDEONO 2.23e+00
LBXHGB -5.96e-01
LBXMCVSI 3.87e-01
LBXMC -5.36e-01
LBXRDW 9.77e-01
LBXPLTSI -2.59e-02
LBXMPSI -1.06e+00
PHAFSTHR.x -3.46e-02
PHAFSTMN.x -1.09e-04
LBDHDDSI 3.36e+00
LBXTC 2.48e-02
LBDTCSI 4.99e-02
URXVOL1 -1.22e-02
depressionTot -7.23e-01
RIAGENDR -6.57e+00
Diab -1.36e+00

1.2.9.2 Modeling outcome using decorrelated set

outcomeModel_DE <- LASSO_1SE(formula(paste(outcome,"~.")),trainage_DE);
predOutcome_DE <- predict(outcomeModel_DE,testage_DE)

pander::pander(as.matrix(outcomeModel_DE$coef),caption="ILAA Coef")
ILAA Coef
(Intercept) 1.07e+02
La_WTMEC2YR -6.94e-04
PEASCTM1 4.71e-04
BPXPLS -1.52e-01
BPXML1 2.74e-01
La_BMXWT 1.41e-01
BMXBMI 1.65e-01
La_BMXLEG -7.97e-01
La_BMXARML 6.46e-01
La_BMXARMC 1.23e-01
La_BMXWAIST 6.35e-01
La_MGXH2T1 5.55e-02
La_MGXH1T2 1.60e-01
La_MGXH2T2 2.30e-01
La_MGXH1T3 7.26e-02
La_MGXH2T3 3.78e-01
MGDCGSZ -2.50e-01
La_OHX04TC 9.76e-01
OHX06TC 2.50e+00
OHX10TC 5.19e-01
La_OHX13TC 1.61e+00
La_OHX20TC 1.08e+00
La_OHX25TC 1.99e-01
La_OHX29TC 1.10e+00
bpresDIM -2.77e-02
La_bpresSYM 1.67e-02
URXUMA -5.20e-04
URXUCR.x -1.09e-02
La_URDACT -1.56e-03
La_LBXWBCSI 4.36e-01
La_LBXLYPCT -3.44e-01
La_LBXMOPCT -1.10e+00
La_LBXEOPCT -2.46e-01
LBXBAPCT 2.22e+00
La_LBDNENO -6.71e-01
La_LBDEONO 3.94e+00
La_LBXHCT -9.63e-02
La_LBXMCHSI 9.92e-01
LBXRDW 9.89e-01
LBXPLTSI -1.68e-02
La_LBXMPSI -9.13e-01
LBDHDD 6.79e-02
La_LBDHDDSI 9.65e+00
LBXTC 1.80e-02
URXVOL1 -7.64e-03
depressionTot -4.40e-01
RIAGENDR -4.03e+00
Diab -9.02e-01
obsCoef <- getObservedCoef(trainage_DE,outcomeModel_DE)
pander::pander(as.matrix(obsCoef$coef),caption="ILAA Modeling")
ILAA Modeling
(Intercept) 1.07e+02
PEASCTM1 4.71e-04
BPXPLS -1.52e-01
OHX10TC 5.19e-01
bpresDIM -2.77e-02
LBXRDW 9.89e-01
URXVOL1 -7.64e-03
depressionTot -4.40e-01
RIAGENDR -4.03e+00
Diab -9.02e-01
WTINT2YR 7.11e-04
WTMEC2YR -6.94e-04
BPXML1 2.59e-01
BMXWT -1.95e-01
BMXBMI -7.77e-01
BMXLEG -7.97e-01
BMXARML 8.11e-01
BMXARMC 1.23e-01
BMXWAIST 6.35e-01
MGXH2T1 5.55e-02
MGXH1T2 1.60e-01
MGXH2T2 2.30e-01
MGXH1T3 3.75e-01
MGXH2T3 3.78e-01
MGDCGSZ -7.78e-01
OHX04TC 9.76e-01
OHX06TC 5.81e-01
OHX13TC 1.01e+00
OHX20TC 1.08e+00
OHX25TC 1.99e-01
OHX29TC 4.52e-01
bpresSYM 1.67e-02
URXUMA 8.18e-04
URXUCR.x -1.09e-02
URDACT -1.56e-03
LBXWBCSI 4.36e-01
LBXLYPCT -1.54e+00
LBXMOPCT -1.10e+00
LBXNEPCT -1.31e+00
LBXEOPCT -1.62e+00
LBXBAPCT 1.16e+00
LBDNENO -1.52e+00
LBDEONO 3.94e+00
LBXHGB -3.50e+00
LBXHCT 1.14e+00
LBXMCHSI 9.92e-01
LBXPLTSI -2.26e-02
LBXMPSI -9.13e-01
LBDHDD -1.82e-01
LBDHDDSI 9.65e+00
LBXTC 1.80e-02


pander::pander(cor.test(predOutcome,testingset[,outcome]),caption="Raw Model")
Raw Model
Test statistic df P value Alternative hypothesis cor
41.3 1037 5.48e-221 * * * two.sided 0.788
pander::pander(cor.test(predOutcome_DE,testage_DE[,outcome]),caption="ILAA-based Model")
ILAA-based Model
Test statistic df P value Alternative hypothesis cor
41 1037 2.5e-219 * * * two.sided 0.787

1.2.9.3 Univariate t-test


rawunittvalues <- apply(as.matrix(testingset[,names(outcomeModel$coef)[-1]]),2,tvals,testingset[,outcome])
names(rawunittvalues) <- names(outcomeModel$coef)[-1]

deunittvalues <- apply(testage_DE[,names(outcomeModel_DE$coef)[-1]],2,tvals,testingset[,outcome])

1.2.9.4 Comparing summaries


psig <- 0.1/(ncol(testingset)-1)
lmod <- lm(paste(outcome,"~."),testingset[,c(outcome,names(outcomeModel$coef)[-1])])
try(vifx <-vif(lm(paste(outcome,"~."),testingset[,c(outcome,names(outcomeModel$coef)[-1])])))

Error in vif.default(lm(paste(outcome, “~.”), testingset[, c(outcome, : there are aliased coefficients in the model

sm <- summary(lmod)
if (length(lmod$coef)>10)
{
  sm$coefficients[1,4] <- 1.0
  gcoef <- lmod$coef[sm$coefficients[,4]<psig]
  lmod <- lm(paste(outcome,"~."),testingset[,c(outcome,names(gcoef))])
  try(vifx <-vif(lm(paste(outcome,"~."),testingset[,c(outcome,names(gcoef))])))
}



sm <- summary(lmod)
smcoef <- as.data.frame(sm$coefficients)
smcoef <- smcoef[order(-abs(smcoef[,3])),]
smcoef$Uni_t_values <- rawunittvalues[rownames(smcoef)]
if (!inherits(vif,"try-error")) smcoef$vif <-vifx[rownames(smcoef)]
smcoef <- smcoef[!is.na(smcoef$Uni_t_values),]
if (nrow(smcoef)>10) smcoef <- smcoef[smcoef[,4]<psig,]

pander::pander(smcoef)
  Estimate Std. Error t value Pr(>|t|) Uni_t_values vif
BPXML1 0.4019 0.02524 15.93 3.45e-51 17.362 1.09
BMXARML 1.6152 0.22316 7.24 8.90e-13 -0.334 2.52
BMXLEG -1.0315 0.16499 -6.25 5.95e-10 -10.171 2.48
MGDCGSZ -0.4653 0.07660 -6.07 1.75e-09 -10.525 17.56
BPXPLS -0.2167 0.03590 -6.04 2.19e-09 -5.817 1.09
LBXEOPCT 2.0961 0.41926 5.00 6.75e-07 3.605 5.76
LBXPLTSI -0.0351 0.00703 -5.00 6.84e-07 -5.049 1.17
LBDEONO -18.7655 4.79578 -3.91 9.72e-05 1.585 5.79
BMXWAIST 0.1054 0.02972 3.55 4.07e-04 5.222 1.44
pander::pander(t(c(R2=sm$r.squared,adj_R2=sm$adj.r.squared)))
R2 adj_R2
0.451 0.444
pander::pander(c(numvar=nrow(smcoef)))
numvar
9


lmod_DE <- lm(paste(outcome,"~."),testage_DE[,c(outcome,names(outcomeModel_DE$coef)[-1])])
try(vifx <-vif(lm(paste(outcome,"~."),testage_DE[,c(outcome,names(outcomeModel_DE$coef)[-1])])))

sm <- summary(lmod_DE)
if (length(lmod_DE$coef)>10)
{
  sm$coefficients[1,4] <- 1.0
  gcoef <- lmod_DE$coef[sm$coefficients[,4]<psig]
  lmod_DE <- lm(paste(outcome,"~."),testage_DE[,c(outcome,names(gcoef))])
  try(vifx <-vif(lm(paste(outcome,"~."),testage_DE[,c(outcome,names(gcoef))])))
}

sm <- summary(lmod_DE)
lacoef <- as.data.frame(sm$coefficients)
lacoef <- lacoef[order(-abs(lacoef[,3])),]
lacoef$Uni_t_values <- deunittvalues[rownames(lacoef)]
if (!inherits(vifx,"try-error")) lacoef$vif <-vifx[rownames(lacoef)]
lacoef <- lacoef[!is.na(lacoef$Uni_t_values),]
if (nrow(lacoef)>10) lacoef <- lacoef[lacoef[,4]<psig,]

lacoef$formula <- theCharformulas[rownames(lacoef)]
lacoef$VarRatio <- varratios[str_remove_all(rownames(lacoef),"La_")]

pander::pander(lacoef)
  Estimate Std. Error t value Pr(>|t|) Uni_t_values vif formula VarRatio
MGDCGSZ -0.38256 0.025366 -15.08 1.49e-46 -10.525 2.52 NA 1.00000
BPXML1 0.31352 0.022827 13.73 1.62e-39 17.362 1.16 NA 1.00000
La_BMXWAIST 0.61538 0.066152 9.30 8.10e-20 10.656 1.29 - (0.494)BMXWT - (0.843)BMXBMI + BMXWAIST + (0.161)MGDCGSZ 0.12686
RIAGENDR -8.59284 1.188495 -7.23 9.45e-13 -0.461 2.88 NA 1.00000
La_BMXLEG -0.99983 0.158900 -6.29 4.64e-10 -12.128 1.38 - (0.142)BMXWT + (0.393)BMXBMI + BMXLEG - (0.206)BMXARML - (3.54e-03)MGDCGSZ 0.44888
La_LBXLYPCT -0.73278 0.120970 -6.06 1.94e-09 -8.066 1.12 + LBXLYPCT + (0.849)LBXNEPCT 0.12936
La_LBXHCT -0.61927 0.110855 -5.59 2.98e-08 1.480 1.32 - (0.083)MGDCGSZ + LBXHCT 0.79676
La_LBXMCHSI 0.93501 0.180333 5.18 2.60e-07 6.218 1.20 - (3.528)LBXHGB + (1.248)LBXHCT + LBXMCHSI 0.65840
BPXPLS -0.15626 0.031602 -4.94 8.92e-07 -5.817 1.11 NA 1.00000
La_BMXARML 1.02170 0.212415 4.81 1.74e-06 5.709 1.19 - (0.212)BMXWT + (0.584)BMXBMI + BMXARML - (5.26e-03)MGDCGSZ 0.39663
La_LBDNENO -1.46350 0.317285 -4.61 4.48e-06 -5.685 1.24 - (0.128)LBXNEPCT + LBDNENO 0.53015
La_OHX20TC 2.53157 0.549632 4.61 4.62e-06 6.826 1.04 + OHX20TC - (0.605)OHX29TC 0.62438
La_WTMEC2YR -0.00102 0.000248 -4.13 3.94e-05 -4.503 1.02 - (1.025)WTINT2YR + WTMEC2YR 0.00232
La_LBXMPSI -1.48586 0.416317 -3.57 3.75e-04 -4.351 1.07 + (6.32e-03)LBXPLTSI + LBXMPSI 0.83438
LBXTC 0.03234 0.009192 3.52 4.54e-04 3.650 1.11 NA 1.00000
LBXPLTSI -0.02216 0.006418 -3.45 5.77e-04 -5.049 1.28 NA 1.00000
pander::pander(t(c(R2=sm$r.squared,adj_R2=sm$adj.r.squared)))
R2 adj_R2
0.582 0.575
pander::pander(c(numvar=nrow(lacoef)))
numvar
16


xvals <-c(min(c(deunittvalues,rawunittvalues))-3,max(c(deunittvalues,rawunittvalues))+3)

par(mfrow=c(1,2),cex=0.5)

plot(smcoef[,c(3,5)],
     main="Raw: Univariate t-values vs regression t-values",
     xlim=xvals,
     ylim=xvals
     )

lmtvals <- lm(smcoef[,5]~smcoef[,3])
pred <- lmtvals$coefficients[1] + lmtvals$coefficients[2] * xvals
lines(x=xvals,y=pred,col="red")
text(xvals[1]+(xvals[2]-xvals[1])/2,xvals[2]-1,sprintf("Slope= %.2f",lmtvals$coefficients[2]))


plot(lacoef[-1,c(3,5)],
     main="ILAA: Univariate t-values vs regression t-values",
     xlim=xvals,
     ylim=xvals
     )

lmtvals <- lm(lacoef[,5]~lacoef[,3])
pred <- lmtvals$coefficients[1] + lmtvals$coefficients[2] * xvals
lines(x=xvals,y=pred,col="red")
text(xvals[1]+(xvals[2]-xvals[1])/2,xvals[2]-1,sprintf("Slope= %.2f",lmtvals$coefficients[2]))


#pander::pander(summary(lmtvals))


pander::pander(cor.test(smcoef[,3],smcoef[,5]))
Pearson’s product-moment correlation: smcoef[, 3] and smcoef[, 5]
Test statistic df P value Alternative hypothesis cor
5.34 7 0.00108 * * two.sided 0.896

pander::pander(cor.test(lacoef[,3],lacoef[,5]))
Pearson’s product-moment correlation: lacoef[, 3] and lacoef[, 5]
Test statistic df P value Alternative hypothesis cor
8.36 14 8.22e-07 * * * two.sided 0.913

par(op)

1.2.9.5 Ploting predictions

par(mfrow=c(1,3),cex=0.5)
plot(lmod$fitted.values,predOutcome,main="Raw: lm train predict vs. test predict",xlab="Train",ylab="Test")
plot(lmod_DE$fitted.values,predOutcome_DE,main="ILAA: lm train predict vs. test predict",xlab="Train",ylab="Test")

plot(predOutcome,predOutcome_DE,xlab="Raw Predicted",ylab="ILAA Predicted",main="Raw vs. ILAA")


par(op)

1.2.10 CV

1.2.10.1 test Correlations

par(op)
corresults <- CV_IDeA(dataframe,outcome,loops=loops)

..Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model ..Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model Error in vif.default(lm(formula(frm), testage_DE)) : there are aliased coefficients in the model . .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model Error in vif.default(lm(formula(frm), testage_DE)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model …Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model . Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model Error in vif.default(lm(formula(frm), testage_DE)) : there are aliased coefficients in the model ..Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model Error in vif.default(lm(formula(frm), testage_DE)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model Error in vif.default(lm(formula(frm), testage_DE)) : there are aliased coefficients in the model .Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model .. Error in vif.default(lm(formula(frm), testingset)) : there are aliased coefficients in the model


mintvals <- min(c(min(corresults$rawtValues),min(corresults$detValues)))
maxvals <- max(c(max(corresults$rawtValues),max(corresults$detValues)))
xvals <- c(mintvals,maxvals)

vioplot(list(raw=corresults$testRawCorrelations,ILAA=corresults$testDeCorrelations),
        ylab="Pearson Correlation",
        main="Test Correlations")


pander::pander(t.test(corresults$testDeCorrelations,corresults$testRawCorrelations,paired=TRUE))
Paired t-test: corresults$testDeCorrelations and corresults$testRawCorrelations
Test statistic df P value Alternative hypothesis mean difference
3.92 29 0.000501 * * * two.sided 0.00433

sylim <- c(1,min(c(20,max(corresults$VIFRaw))))
vioplot(list(raw=corresults$VIFRaw,ILAA=corresults$VIFDe),
        ylab="VIF",
        ylim=sylim,
        main="Test VIF")



pander::pander(summary(cbind(raw=corresults$VIFRaw,ILAA=corresults$VIFDe)))
raw ILAA
Min. : 88.16 Min. :3.160
1st Qu.: 98.44 1st Qu.:3.757
Median : 99.86 Median :3.853
Mean : 98.02 Mean :4.191
3rd Qu.:100.60 3rd Qu.:3.967
Max. :102.27 Max. :9.434
summary(corresults$VIFRaw)

Min. 1st Qu. Median Mean 3rd Qu. Max. 88.16 98.44 99.86 98.02 100.60 102.27

1.2.11 The t-values


par(op)
par(mfrow=c(1,2),cex=0.5)
plot(corresults$rawtValues,
     main="Raw: Univariate t-values vs Model t-values",
     xlab="Univariate",
     ylab="Model",
     xlim=xvals,
     ylim=xvals)

lmtvals <- lm(Model~.,corresults$rawtValues)
pred <- lmtvals$coefficients[1] + lmtvals$coefficients[2] * xvals
lines(x=xvals,y=pred,col="red")
text(xvals[1]+(xvals[2]-xvals[1])/2,xvals[2]-1,sprintf("Slope= %.2f",lmtvals$coefficients[2]))

pander::pander(summary(lmtvals))
  Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.386 0.07064 -5.47 5.41e-08
Uni 0.196 0.00961 20.40 2.16e-81
Fitting linear model: Model ~ .
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
1426 2.6 0.226 0.226

plot(corresults$detValues,
      main="ILAA: Univariate t-values vs Model t-values",
     xlab="Univariate",
     ylab="Model",
     xlim=xvals,
     ylim=xvals)

lmtvals <- lm(Model~.,corresults$detValues)
pred <- lmtvals$coefficients[1] + lmtvals$coefficients[2] * xvals
lines(x=xvals,y=pred,col="red")
text(xvals[1]+(xvals[2]-xvals[1])/2,xvals[2]-1,sprintf("Slope= %.2f",lmtvals$coefficients[2]))


pander::pander(summary(lmtvals))
  Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.879 0.05257 -16.7 2.96e-57
Uni 0.602 0.00908 66.3 0.00e+00
Fitting linear model: Model ~ .
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
1371 1.88 0.763 0.763

1.3 PCA, EFA, PLS, ERT


toPCA <- sapply(apply(dataframe,2,unique),length) >= 5 & colnames(dataframe) != outcome

pc <- prcomp(dataframe[,toPCA],center = TRUE,scale. = TRUE,tol=0.01)   #principal components

if (ncol(dataframe)<20)
{
pander::pander(as.data.frame(pc$rotation),caption="PCA")
}

rotstd <- log10(abs(100*pc$rotation)+1.0)
  gplots::heatmap.2(rotstd,
                    trace = "none",
                    dendrogram="none",
                    breaks=c(0,0.5,1,2,3),
#                    scale="row",
                    mar = c(5,5),
                    col=rainbow(4),
                    main = "PCA Rotation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                    Rowv=FALSE,
                    Colv=FALSE,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="log(|100Rot|+1)",
                    xlab="Output Feature", ylab="Input Feature")



efa <- try(fa(dataframe[,toPCA],ncol(pc$rotation),rotate="varimax",warnings=FALSE))  # EFA analysis

if (!inherits(efa,"try-error"))
{
  if (ncol(dataframe)<20)
  {
   pander::pander(as.data.frame(efa$weights),caption="EFA")
  }
  rotstd <- log10(abs(100*efa$weights)+1.0)
    gplots::heatmap.2(rotstd,
                      trace = "none",
                      dendrogram="none",
                      breaks=c(0,0.5,1,2,3),
  #                    scale="row",
                      mar = c(5,5),
                      col=rainbow(4),
                      main = "EFA weights",
                      cexRow = cexheat,
                      cexCol = cexheat,
                      Rowv=FALSE,
                      Colv=FALSE,
                     srtCol=45,
                     srtRow=45,
                      key.title=NA,
                      key.xlab="log(|100W|+1)",
                      xlab="Output Feature", ylab="Input Feature")
}


  
  
plm <- plsr(formula=formula(paste(outcome,"~.")),data=dataframe,scale =TRUE)
if (ncol(dataframe)<20)
{
  lds <- plm$loadings
  lds2 <- matrix(as.numeric(lds),nrow=nrow(lds),ncol=ncol(lds))
  colnames(lds2) <- colnames(lds)
  rownames(lds2) <- rownames(lds)
  pander::pander(lds2,caption="PLS")
}

loadadings <- log10(abs(100*plm$loadings) + 1.0)
  gplots::heatmap.2(loadadings,
                    breaks=c(0,0.5,1,2,3),
                    trace = "none",
                    dendrogram="none",
#                    scale="row",
                    mar = c(5,5),
                    col=rainbow(4),
                    main = "PLS Loadings",
                    cexRow = cexheat,
                    cexCol = cexheat,
                    Rowv=FALSE,
                    Colv=FALSE,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="log(|100Beta|+1)",
                    xlab="Output Feature", ylab="Input Feature")



ERTmod <- ILAA(dataframe,Outcome = outcome,thr=thro)

ERT <- log10(abs(100*attr(ERTmod,"UPLTM")) + 1);
  gplots::heatmap.2(ERT,
                    trace = "none",
                    breaks=c(0,0.5,1,2,3),
                    mar = c(5,5),
                    col=rainbow(4),
                    main = "ERT Rotation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                    dendrogram="none",
                    Rowv=FALSE,
                    Colv=FALSE,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="log(|100Beta|+1)",
                    xlab="Output Feature", ylab="Input Feature")


if (ncol(dataframe)<20)
{
pander::pander(attr(ERTmod,"UPLTM"),caption="ERT")
}

1.4 U-MAP Visualization of features

1.4.1 The UMAP on Raw Data

  thesamples <- c(1:nrow(dataframe));
  if (nrow(dataframe)>2000) 
  {
    thesamples <- sample(thesamples,2000)
  }

  classes <- as.integer(scale(dataframe[thesamples,outcome]))
  classes <- classes - min(classes) + 1
  raincolors <- heat.colors(length(unique(classes)))
  dtatoplot <- as.matrix(FRESAScale(dataframe[thesamples,],method="OrderLogit")$scaledData)
  datasetframe.umap = umap(dtatoplot,n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: RAW",col=raincolors[classes],pch=15)

  
    gplots::heatmap.2(dtatoplot,
                    trace = "none",
                    mar = c(5,5),
                    col=heat.colors(5),
                    main = "Raw",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="Z",
                    xlab="Feature", ylab="Subject")




  dtatoplot <- as.matrix(FRESAScale(predict(pc,dataframe[thesamples,]),method="OrderLogit")$scaledData)
  datasetframe.umap = umap(dtatoplot,n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: PCA",col=raincolors[classes],pch=15)


    gplots::heatmap.2(dtatoplot,
                    trace = "none",
                    mar = c(5,5),
                    col=heat.colors(5),
                    main = "PCA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="Z",
                    xlab="Feature", ylab="Subject")


  
if (!inherits(efa,"try-error"))
{
  dtatoplot <- as.matrix(FRESAScale(predict(efa,dataframe[thesamples,toPCA]),method="OrderLogit")$scaledData)
  datasetframe.umap = umap(dtatoplot,n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: EFA",col=raincolors[classes],pch=15)
    gplots::heatmap.2(dtatoplot,
                    trace = "none",
                    mar = c(5,5),
                    col=heat.colors(5),
                    main = "EFA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="Z",
                    xlab="Feature", ylab="Subject")
}

  rotframe <- as.matrix(scale(dataframe[thesamples,rownames(plm$loadings)])) %*% plm$loadings
  
  dtatoplot <- as.matrix(FRESAScale(rotframe,method="OrderLogit")$scaledData)
  datasetframe.umap = umap(dtatoplot,n_components=2)
  
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: PLS",col=raincolors[classes],pch=15)

      gplots::heatmap.2(dtatoplot,
                    trace = "none",
                    mar = c(5,5),
                    col=heat.colors(5),
                    main = "PLS",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="Z",
                    xlab="Feature", ylab="Subject")



  dtatoplot <- as.matrix(FRESAScale(ERTmod[thesamples,colnames(ERTmod) != outcome],method="OrderLogit")$scaledData)
  datasetframe.umap = umap(dtatoplot,n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: ERT",col=raincolors[classes],pch=15)

      gplots::heatmap.2(dtatoplot,
                    trace = "none",
                    mar = c(5,5),
                    col=heat.colors(5),
                    main = "ERT",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="Z",
                    xlab="Feature", ylab="Subject")

1.5 The Age plots


plot(predOutcome,testingset[,outcome],xlab="Raw Predicted",ylab=outcome,main="Raw: Age Prediction")

plot(predOutcome_DE,testingset[,outcome],xlab="IDeA Predicted",ylab=outcome,main="IDeA Age Prediction")

1.6 Descriptors



pander::pander(demographiDesc[demographiDesc$`Variable Name` %in% colnames(transform),c(1,2)])
  Variable Name Variable Description
46 WTINT2YR Full sample 2 year interview weight.
47 WTMEC2YR Full sample 2 year MEC exam weight.
pander::pander(examinationDesc[examinationDesc$`Variable Name` %in% colnames(transform),c(1,2)])
  Variable Name Variable Description
12 BPXML1 MIL: maximum inflation levels (mm Hg)
109 BMXARMC Arm Circumference (cm)
110 BMXARML Upper Arm Length (cm)
111 BMXBMI Body Mass Index (kg/m**2)
113 BMXHT Standing Height (cm)
114 BMXLEG Upper Leg Length (cm)
120 BMXWAIST Waist Circumference (cm)
121 BMXWT Weight (kg)
175 MGDCGSZ Combined grip strength (kg): the sum of the largest reading from each hand.
185 MGXH1T1 Grip strength (kg), hand 1, test 1
187 MGXH1T2 Grip strength (kg), hand 1, test 2
189 MGXH1T3 Grip strength (kg), hand 1, test 3
191 MGXH2T1 Grip strength (kg), hand 2, test 1
193 MGXH2T2 Grip strength (kg), hand 2, test 2
195 MGXH2T3 Grip strength (kg), hand 2, test 3
212 OHX04TC Tooth Count: Upper right 2nd bicuspid/2nd primary molar (2B)
219 OHX06TC Tooth Count: Upper right cuspid (C)
236 OHX11TC Tooth Count: Upper left cuspid (C)
244 OHX13TC Tooth Count: Upper left 2nd bicuspid/2nd primary molar (2B)
266 OHX20TC Tooth Count: Lower left 2nd bicuspid/2nd primary molar (2B)
282 OHX25TC Tooth Count: Lower right central incisor (CI)
296 OHX29TC Tooth Count: Lower right 2nd bicuspid/2nd primary molar (2B)
pander::pander(questionnaireDesc[questionnaireDesc$`Variable Name` %in% colnames(transform),c(1,2)])
Variable Name Variable Description
pander::pander(labsDesc[labsDesc$`Variable Name` %in% colnames(transform),c(1,2)])
  Variable Name Variable Description
5 LBDBANO Basophils number (1000 cells/uL)
6 LBDEONO Eosinophils number (1000 cells/uL)
7 LBDLYMNO Lymphocyte number (1000 cells/uL)
8 LBDMONO Monocyte number (1000 cells/uL)
9 LBDNENO Segmented neutrophils num (1000 cell/uL)
10 LBXBAPCT Basophils percent (%)
11 LBXEOPCT Eosinophils percent (%)
12 LBXHCT Hematocrit (%)
13 LBXHGB Hemoglobin (g/dL)
14 LBXLYPCT Lymphocyte percent (%)
15 LBXMC Mean cell hemoglobin concentration (g/dL)
16 LBXMCHSI Mean cell hemoglobin (pg)
17 LBXMCVSI Mean cell volume (fL)
18 LBXMOPCT Monocyte percent (%)
19 LBXMPSI Mean platelet volume (fL)
20 LBXNEPCT Segmented neutrophils percent (%)
21 LBXPLTSI Platelet count (1000 cells/uL)
22 LBXRBCSI Red blood cell count (million cells/uL)
24 LBXWBCSI White blood cell count (1000 cells/uL)
26 LBDHDD Direct HDL-Cholesterol (mg/dL)
27 LBDHDDSI Direct HDL-Cholesterol (mmol/L)
29 LBDTCSI Total Cholesterol( mmol/L)
30 LBXTC Total Cholesterol( mg/dL)
1413 LBXHCT Hydroxycotinine, Serum (ng/mL)
1452 URDACT Albumin creatinine ratio (mg/g)
1453 URXCRS Creatinine, urine (umol/L)
1455 URXUMA Albumin, urine (ug/mL)
1456 URXUMS Albumin, urine (mg/L)